library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## load the name transliteration table
## (the PsycInfo ASCII name was guessed from the PsycTESTS name)
translit <- readRDS(file = "raw_data/psycinfo_psyctests_names.rds")

## get our first scrape (by journal, checking counts for each year in each journal for top tests)
## The per-journal rows are collapsed to one usage count per test name and year.
psycinfo_scrape_by_journal <- read_tsv("raw_data/merged_table_all.tsv") %>%
  # rows without a test name cannot be matched to anything later on
  drop_na(Name) %>%
  # this tsv can be found in "Scraping-EBSCO-Host\data\merged tables"
#  mutate(Name = toTitleCase(Name)) %>%
  rename(usage_count = "Hit Count") %>%
  group_by(Name, Year) %>%
  # .groups = "drop" returns an ungrouped table, so no grouping by Name
  # lingers on the result and summarise() no longer has to report it
  summarise(usage_count = sum(usage_count), .groups = "drop")
## Rows: 309223 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Name, Journal
## dbl (3): Hit Count, Year, number of search results
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'Name'. You can override using the `.groups` argument.
## get our second scrape (by test DOI and year)
## overview table: one row per DOI with first/last publication year and hits
overview <- readr::read_tsv(
  file = "raw_data/20230617_ebsco_scrape_clean_overview_table_1.tsv"
)
## Rows: 71692 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): DOI
## dbl (3): first_pub_year, last_pub_year, Hits
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# second scrape broken down by year: columns DOI, Year, Hits
byyear <- readr::read_tsv(
  file = "raw_data/20230617_ebsco_scrape_table_years_1.tsv"
)
## Rows: 218142 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): DOI
## dbl (2): Year, Hits
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Frequency table of the total number of hits per DOI in the by-year scrape
byyear %>%
  group_by(DOI) %>%
  summarise(Hits = sum(Hits, na.rm = TRUE)) %>%
  pull(Hits) %>%
  table()
## .
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
##    27 13280  4107  2140  1487  1077   864   645   570   464   375   375   285 
##    13    14    15    16    17    18    19    20    21    22    23    24    25 
##   243   237   220   168   180   163   114   141   132   102   108   113   108 
##    26    27    28    29    30    31    32    33    34    35    36    37    38 
##    83    91    72    86    88    68    81    77    68    61    45    56    48 
##    39    40    41    42    43    44    45    46    47    48    49    50    51 
##    42    60    48    37    45    38    42    41    34    29    29    33    35 
##    52    53    54    55    56    57    58    59    60    61    62    63    64 
##    26    31    25    21    22    32    19    37    26    23    18    24    16 
##    65    66    67    68    69    70    71    72    73    74    75    76    77 
##    25    19    19    22    19    27    18    18    11    12    12    16    11 
##    78    79    80    81    82    83    84    85    86    87    88    89    90 
##    15    22    16    14    10    13    16    10    13     6    10    13    11 
##    91    92    93    94    95    96    97    98    99   100   101   102   103 
##    10     8    13    14    11    10    17    12    11    10    13    12     6 
##   104   105   106   107   108   109   110   111   112   113   114   115   116 
##     8     8    13     9    13     8     6     9     6     7     8     4     5 
##   117   118   119   120   121   122   123   124   125   126   127   128   129 
##     5    13     8     7     7     6    10     9     7     3    13     4     4 
##   130   131   132   133   134   135   136   137   138   139   140   141   142 
##    11     6     4     3     6     5     7     3     6     4     3     8     7 
##   143   144   145   146   147   148   149   150   151   152   153   154   155 
##     9     9     4     8     3     9     4     7     9     6     5     5     3 
##   156   157   158   159   160   161   162   163   164   165   166   167   168 
##     6     5     5     5     4     6     3     3     4     3     3     5     1 
##   169   170   171   172   173   174   175   176   177   178   179   180   181 
##     2     5     3     3     3     3     5     2     2     2     4     8     5 
##   182   183   184   185   186   187   189   190   191   192   193   194   195 
##     4     4     6     5     2     1     3     5     6     1     6     4     5 
##   196   197   198   199   200   201   202   203   204   205   206   207   208 
##     4     4     1     1     3     3     5     1     3     3     3     5     2 
##   209   210   211   212   213   214   215   216   218   219   220   221   222 
##     5     3     7     1     3     4     2     3     4     3     3     4     1 
##   223   224   225   226   227   228   230   231   233   234   235   236   237 
##     2     6     4     1     1     3     1     4     2     3     2     2     1 
##   238   239   240   241   242   244   245   246   247   248   249   251   252 
##     1     4     6     2     1     1     4     4     1     1     1     2     1 
##   254   255   256   257   258   259   260   262   263   264   266   267   268 
##     1     2     3     1     2     3     3     4     3     1     1     2     1 
##   269   270   271   272   274   275   276   278   279   280   282   283   284 
##     2     2     1     3     3     1     2     4     4     2     2     2     2 
##   285   286   287   288   290   291   292   293   294   295   296   297   298 
##     2     1     2     1     1     2     1     3     3     1     2     2     2 
##   299   300   304   305   307   308   309   311   312   313   314   315   316 
##     3     1     1     1     1     4     1     1     1     1     1     3     2 
##   318   319   320   322   324   325   326   327   329   330   331   332   333 
##     1     3     4     2     1     2     1     1     2     1     2     4     1 
##   334   337   338   339   341   342   346   347   348   349   353   358   359 
##     1     1     1     1     1     1     2     1     1     1     1     3     2 
##   361   363   364   367   368   371   372   376   377   379   380   384   387 
##     2     1     2     1     1     2     1     1     2     1     1     2     2 
##   389   392   393   394   396   397   398   400   401   405   407   408   411 
##     1     1     1     1     1     2     1     2     1     2     2     1     1 
##   414   415   418   419   423   424   428   429   430   431   436   437   438 
##     1     1     1     1     1     1     1     1     1     2     1     1     2 
##   441   443   445   446   451   452   456   460   462   464   466   470   483 
##     3     2     1     2     1     1     1     1     1     1     2     1     1 
##   485   486   488   491   495   499   500   504   512   518   519   520   528 
##     1     1     1     1     1     1     3     1     1     1     1     1     2 
##   529   532   534   535   537   538   539   540   542   544   545   546   550 
##     1     1     1     1     1     1     1     1     1     2     1     1     1 
##   553   554   556   561   562   568   569   570   574   577   584   585   589 
##     1     1     1     1     1     1     1     1     2     1     1     1     1 
##   595   597   598   600   601   603   604   623   626   627   631   632   633 
##     1     1     1     1     1     1     1     1     1     1     2     1     1 
##   639   642   656   658   660   661   662   669   671   675   677   678   679 
##     1     2     1     1     1     1     1     1     1     1     1     1     1 
##   682   686   688   696   698   700   709   710   712   714   716   718   720 
##     1     1     1     1     1     1     1     1     1     1     2     2     1 
##   722   724   725   727   728   730   732   733   755   761   762   764   772 
##     1     1     1     1     2     1     1     1     1     1     1     1     1 
##   773   780   783   794   796   800   808   812   813   816   819   825   840 
##     1     1     1     1     2     1     2     1     1     2     1     1     1 
##   844   845   847   848   849   856   862   871   886   891   908   911   915 
##     1     1     2     1     1     1     1     1     1     1     2     1     1 
##   919   928   933   934   935   950   959   969   973   974   981   988   992 
##     1     1     2     1     2     1     1     2     2     1     1     1     1 
##   993  1009  1015  1018  1043  1071  1074  1077  1119  1121  1131  1135  1161 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  1163  1172  1173  1181  1184  1219  1224  1247  1251  1253  1255  1267  1296 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  1300  1323  1340  1378  1380  1392  1395  1399  1402  1429  1470  1479  1487 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  1519  1521  1553  1562  1569  1579  1642  1648  1688  1748  1772  1825  1868 
##     2     1     1     1     1     1     1     1     1     1     1     1     1 
##  1901  1932  1937  2052  2065  2074  2102  2121  2130  2132  2149  2200  2254 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  2304  2352  2584  2678  2700  2847  3053  3067  3134  3157  3487  3500  3637 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  3675  3750  3790  4041  4096  4410  4484  4876  4888  5147  6257  6313  6365 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  6408  6494  7023  7095  7238  7504  7597  8420  8513  8709  9492 10896 12134 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
## 13316 14268 18484 25118 
##     1     1     1     1
# Tests with exactly one hit overall: that single hit is attributed to the
# first publication year listed in the overview table.
# NOTE(review): filter(Hits == 1) already drops rows with missing Hits, so the
# coalesce() below cannot change anything - confirm whether rows with NA hits
# were meant to be kept.
one_hit_wonders <- overview %>%
  filter(Hits == 1) %>%
  mutate(
    Year = first_pub_year,
    Hits = coalesce(Hits, 1)
  )
# for some few, the call was repeated by year for some reason
one_hit_wonders %>%
  select(DOI, first_pub_year) %>%
  inner_join(byyear, by = "DOI") %>%
  arrange(DOI)
# remove those DOIs from the by-year table so they are not counted twice
byyear <- anti_join(byyear, one_hit_wonders, by = "DOI")

# stack the one-hit rows on top of the by-year rows, then attach the overview
# columns (its Hits column becomes total_hits)
psycinfo_by_doi <- one_hit_wonders %>%
  select(DOI, Year, Hits) %>%
  bind_rows(byyear) %>%
  left_join(rename(overview, total_hits = Hits), by = "DOI")


## don't use tests with names that occur many times
# all rows whose PsycInfo name is shared with at least one other row
dupe_names <- translit %>%
  group_by(name_psycinfo) %>%
  filter(n() >= 2) %>%
  ungroup()
# flag shared names, then keep only the first row for each name
translit <- translit %>%
  group_by(name_psycinfo) %>%
  mutate(non_unique_name = n() > 1) %>%
  ungroup() %>%
  filter(!duplicated(name_psycinfo))

# merge it all
# Combine the per-DOI counts (scrape 2) with the name lookup and with the
# per-name/year counts (scrape 1). Full joins keep rows that exist in only one
# of the sources, so missing values are expected in the result.
psycinfo <- psycinfo_by_doi %>%
  full_join(translit %>% select(DOI, name_psycinfo, NameOC), by = "DOI") %>%
  full_join(psycinfo_scrape_by_journal,
            by = c("name_psycinfo" = "Name", "Year")) %>%
  rename(hits_scrape_1 = usage_count,
         hits_scrape_2 = Hits,
         total_hits_scrape_2 = total_hits) %>%
  group_by(name_psycinfo) %>%
  # sum() without na.rm: the total is NA for a name as soon as one of its rows
  # has no scrape 1 count
  mutate(total_hits_scrape_1 = sum(hits_scrape_1)) %>%
  # drop the grouping again so later steps do not inherit it by accident
  ungroup()
# number of missing values per column
psycinfo %>% is.na() %>% colSums()
##                 DOI                Year       hits_scrape_2      first_pub_year 
##               96747               39022              135768              135768 
##       last_pub_year total_hits_scrape_2       name_psycinfo              NameOC 
##              135768              135768                3079               99825 
##       hits_scrape_1 total_hits_scrape_1 
##              218121              265989
## aggregate it all
## One row per PsycInfo name with the summed hits of each scrape; missing
## counts are ignored, so a name without any count in a scrape gets 0.
psycinfo_overall <- psycinfo %>%
  group_by(name_psycinfo) %>%
  summarise(
    total_hits_scrape_1 = sum(hits_scrape_1, na.rm = TRUE),
    total_hits_scrape_2 = sum(hits_scrape_2, na.rm = TRUE)
  ) %>%
  # state the join key explicitly instead of relying on shared column names
  left_join(translit %>% select(DOI, name_psycinfo), by = "name_psycinfo")
## Joining with `by = join_by(name_psycinfo)`
## correlate totals
## Pearson correlation between the per-name totals of the two scrapes
cor.test(
  x = psycinfo_overall$total_hits_scrape_1,
  y = psycinfo_overall$total_hits_scrape_2
)
## 
##  Pearson's product-moment correlation
## 
## data:  psycinfo_overall$total_hits_scrape_1 and psycinfo_overall$total_hits_scrape_2
## t = 249.62, df = 104320, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6076982 0.6152964
## sample estimates:
##       cor 
## 0.6115114
# correlation of the totals, restricted to names with hits in both scrapes
psycinfo_overall %>%
  filter(total_hits_scrape_2 > 0, total_hits_scrape_1 > 0) %>%
  summarise(cor(total_hits_scrape_1, total_hits_scrape_2))
## correlate by year, diffs, proportions
cor.test(
  x = psycinfo$hits_scrape_1,
  y = psycinfo$hits_scrape_2
)
## 
##  Pearson's product-moment correlation
## 
## data:  psycinfo$hits_scrape_1 and psycinfo$hits_scrape_2
## t = 467.52, df = 39014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9196473 0.9226533
## sample estimates:
##      cor 
## 0.921164
# mean absolute difference between the two scrapes (rows with a missing count
# are ignored)
psycinfo %>%
  mutate(diff = hits_scrape_2 - hits_scrape_1) %>%
  pull(diff) %>%
  abs() %>%
  mean(na.rm = TRUE)
## [1] 12.3914
# Histogram of the per-row ratio scrape 2 / scrape 1 on a log10 x-axis.
# qplot() is deprecated since ggplot2 3.4.0, so the plot is built explicitly.
# Ratios that are missing, zero or infinite cannot be placed on a log scale;
# they are dropped up front rather than discarded with warnings while binning.
psycinfo %>%
  ungroup() %>%
  mutate(prop = hits_scrape_2 / hits_scrape_1) %>%
  filter(is.finite(prop), prop > 0) %>%
  ggplot(aes(x = prop)) +
  geom_histogram(bins = 30) +
  scale_x_log10()
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 318095 rows containing non-finite values (`stat_bin()`).

# mean signed difference (scrape 2 minus scrape 1), ignoring missing values
psycinfo %>%
  mutate(diff = hits_scrape_2 - hits_scrape_1) %>%
  pull(diff) %>%
  mean(na.rm = TRUE)
## [1] 11.99798
# psycinfo %>% filter(hits_scrape_1 > hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()

# number of rows where scrape 2 reports more hits than scrape 1
psycinfo %>%
  filter(hits_scrape_2 > hits_scrape_1) %>%
  nrow()
## [1] 27545
# frequency of each difference value, sorted from rarest to most common
psycinfo %>%
  mutate(diff = hits_scrape_2 - hits_scrape_1) %>%
  pull(diff) %>%
  table() %>%
  sort()
## .
## -165 -143  -99  -98  -84  -81  -73  -50  -43  -41  -39  -35  -31  -27  -23  -21 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  -19  -15  -13  140  143  154  179  182  186  190  195  206  216  226  228  233 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  239  241  243  246  248  250  253  257  258  260  262  263  265  268  269  274 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  278  281  284  285  287  290  293  294  298  301  302  307  311  312  313  316 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  319  325  326  327  328  332  334  335  337  340  344  347  350  351  355  358 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  359  365  376  379  381  383  394  396  398  400  404  406  410  413  414  416 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  417  418  421  428  429  430  432  433  434  437  439  441  443  446  449  460 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  462  466  474  490  493  495  496  502  510  511  512  516  526  531  539  553 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  562  563  567  571  577  586  590  602  604  613  633  639  640  644  655  659 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  661  683  691  700  701  704  714  736  765  771  775  791  804  806  828  854 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  858  865  879  919  950  955  965  966  976 1004 1005 1265 1335 1591  -96  -17 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    2    2 
##  -16  -12  105  119  135  136  141  153  157  159  160  162  164  165  167  169 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  172  173  174  180  183  191  197  198  200  207  211  217  218  225  232  235 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  236  238  244  256  261  267  270  272  273  282  288  295  304  305  306  317 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  318  322  339  342  346  349  352  369  373  375  380  385  392  407  408  431 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  436  438  440  450  456  548  680  -11   -9   99  118  138  146  158  171  177 
##    2    2    2    2    2    2    2    3    3    3    3    3    3    3    3    3 
##  178  185  189  192  196  199  202  204  205  208  215  219  220  222  223  234 
##    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3 
##  247  254  264  275  279  286  297  303  309  329  336  356  367  374  382  537 
##    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3 
##  -14  -10  132  134  142  144  145  148  150  163  170  176  187  188  193  194 
##    4    4    4    4    4    4    4    4    4    4    4    4    4    4    4    4 
##  224  255  366   -8   97  126  129  139  149  152  155  156  161  166  168  175 
##    4    4    4    5    5    5    5    5    5    5    5    5    5    5    5    5 
##  181  184  209  229  231  107  109  111  116  125  130  151  103  104  113  117 
##    5    5    5    5    5    6    6    6    6    6    6    6    7    7    7    7 
##  120  121  127  133  137  147   89  115  123   -7  110  112  114  124  131   86 
##    7    7    7    7    7    7    8    8    8    9    9    9    9    9    9   10 
##   90   92  100  101  108  122  128   74   87   88   95   96  102   91   93   94 
##   10   10   10   10   10   10   10   11   11   11   11   11   11   12   12   12 
##  106   77   85   98   63   82   72   80   81   83   84   76   70   73   79   -6 
##   12   13   14   14   15   15   16   16   16   16   16   17   18   18   19   20 
##   65   64   69   75   78   60   68   71   66   67   62   -5   59   61   55   57 
##   20   21   21   22   23   24   24   24   26   28   29   30   31   32   33   33 
##   58   50   53   56   46   52   54   43   40   48   51   49   45   42   44   41 
##   34   35   35   39   43   45   45   46   47   47   48   50   52   55   58   59 
##   47   39   38   -4   37   34   36   35   33   32   30   29   31   27   28   26 
##   61   62   69   70   76   77   77   85   96  101  102  107  108  112  123  142 
##   25   23   -3   24   22   21   20   19   18   17   16   15   14   13   12   11 
##  157  163  165  166  183  200  225  248  268  285  311  357  383  431  544  591 
##   -2   10    9    8    7    6    5    4    3    2   -1    1    0 
##  615  688  765  933 1066 1228 1589 2033 2638 3487 3759 4918 6757
# psycinfo %>% filter(hits_scrape_1 < hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()

Top tests in each scrape

Only in PsycInfo Scrape 1

# Names with hits in scrape 1 but none in scrape 2: how many names, their
# summed hits, and the mean number of hits per name.
psycinfo_overall %>%
  ungroup() %>%
  filter(total_hits_scrape_2 == 0, total_hits_scrape_1 > 0) %>%
  summarise(n(), sum(total_hits_scrape_1), sum(total_hits_scrape_1)/n())
# limit paged table printing to two columns
options(cols.min.print = 2, cols.print = 2)
# the same names as an interactive table, most used first
psycinfo_overall %>%
  ungroup() %>%
  # filter(is.na(DOI)) %>%
  filter(total_hits_scrape_1 >= 1, total_hits_scrape_2 == 0) %>%
  select(name_psycinfo, total_hits_scrape_1) %>%
  arrange(desc(total_hits_scrape_1)) %>%
  DT::datatable()
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Only in PsycTests Scrape 2

# Names with hits in scrape 2 but none in scrape 1: how many names, their
# summed hits, and the mean number of hits per name.
psycinfo_overall %>%
  ungroup() %>%
  filter(total_hits_scrape_2 > 0, total_hits_scrape_1 == 0) %>%
  summarise(n(), sum(total_hits_scrape_2), sum(total_hits_scrape_2)/n())
# the same selection as an interactive table, most used first; rows without a
# name are left out of the table
psycinfo_overall %>%
  ungroup() %>%
  filter(total_hits_scrape_2 >= 1, total_hits_scrape_1 == 0) %>%
  # filter(!is.na(DOI), is.na(total_hits_scrape_1) | total_hits_scrape_1 == 0) %>%
  drop_na(name_psycinfo, total_hits_scrape_2) %>%
  arrange(desc(total_hits_scrape_2)) %>%
  select(name_psycinfo, total_hits_scrape_2) %>%
  DT::datatable()
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Hits only in scrape 1, even though we have a match for the name

# Names that have a DOI from the lookup table but hits in scrape 1 only:
# count, summed hits, and mean hits per name.
psycinfo_overall %>%
  ungroup() %>%
  filter(
    !is.na(DOI),
    total_hits_scrape_2 == 0,
    total_hits_scrape_1 > 0
  ) %>%
  summarise(n(), sum(total_hits_scrape_1), sum(total_hits_scrape_1)/n())

Hits only in scrape 1 without a clear match for the name

# Names without a DOI from the lookup table and with hits in scrape 1 only:
# count, summed hits, and mean hits per name.
psycinfo_overall %>%
  ungroup() %>%
  filter(
    is.na(DOI),
    total_hits_scrape_2 == 0,
    total_hits_scrape_1 > 0
  ) %>%
  summarise(n(), sum(total_hits_scrape_1), sum(total_hits_scrape_1)/n())

Merge Scrape 1 and 2

# Per-year scrape 1 counts for every name that has hits in scrape 1 but none
# in scrape 2, shaped like the by-DOI table (DOI, Year, Hits, first/last
# year, total hits).
psycinfo_scrape_1_without_hits_in_2 <- psycinfo_overall %>%
  ungroup() %>%
  filter(
    total_hits_scrape_1 > 0,
    is.na(total_hits_scrape_2) | total_hits_scrape_2 == 0
  ) %>%
  select(DOI, name_psycinfo) %>%
  # one row per name before the yearly counts are attached
  distinct(name_psycinfo, .keep_all = TRUE) %>%
  left_join(
    psycinfo_scrape_by_journal %>%
      rename(name_psycinfo = Name, Hits = usage_count),
    by = "name_psycinfo",
    multiple = "all"
  ) %>%
  # names without a DOI use the name itself as their identifier
  mutate(DOI = coalesce(DOI, name_psycinfo)) %>%
  group_by(DOI) %>%
  mutate(
    first_pub_year = min(Year, na.rm = TRUE),
    last_pub_year = max(Year, na.rm = TRUE),
    total_hits = sum(Hits, na.rm = TRUE)
  ) %>%
  ungroup()

# Number of distinct identifiers, summed hits, and mean hits per identifier.
# Missing hit counts are ignored, as in the other summaries of this kind, so a
# single NA cannot turn the totals into NA.
psycinfo_scrape_1_without_hits_in_2 %>%
  summarise(n_distinct(DOI),
            sum(Hits, na.rm = TRUE),
            sum(Hits, na.rm = TRUE)/n_distinct(DOI))
# Scrape 2 rows with a known year and hit count, excluding every DOI whose
# name has no scrape 2 hits in the aggregated table, with the PsycInfo name
# attached from the lookup table.
psycinfo_by_doi_with_hits <- psycinfo_by_doi %>%
  drop_na(Hits, Year) %>%
  anti_join(
    psycinfo_overall %>%
      filter(total_hits_scrape_2 == 0) %>%
      select(DOI),
    by = "DOI"
  ) %>%
  left_join(select(translit, DOI, name_psycinfo), by = "DOI")
# rows for which no PsycInfo name could be attached
psycinfo_by_doi_with_hits %>%
  pull(name_psycinfo) %>%
  is.na() %>%
  sum()
## [1] 3078
# rows that do have a PsycInfo name
psycinfo_by_doi_with_hits %>%
  pull(name_psycinfo) %>%
  is.na() %>%
  `!`() %>%
  sum()
## [1] 215037
# Scrape 2 part: distinct DOIs, summed hits, and mean hits per DOI.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
psycinfo_by_doi_with_hits %>%
  summarise(n_distinct(DOI),
            sum(Hits, na.rm = TRUE),
            sum(Hits, na.rm = TRUE)/n_distinct(DOI))
# Stack both parts; the new `source` column records which scrape a row came
# from ("scrape_2" or "scrape_1").
psycinfo_merged <- bind_rows(
  scrape_2 = psycinfo_by_doi_with_hits,
  scrape_1 = psycinfo_scrape_1_without_hits_in_2,
  .id = "source"
)

# the same summary for the merged table
psycinfo_merged %>%
  summarise(n_distinct(DOI),
            sum(Hits, na.rm = TRUE),
            sum(Hits, na.rm = TRUE)/n_distinct(DOI))
# store the merged table for later use
saveRDS(psycinfo_merged, "raw_data/psycinfo_merged_scrape_1_and_2.rds")

Joint top list

# Total hits per test across the merged table, shown as an interactive table
# with the most used tests first.
psycinfo_merged %>%
  group_by(DOI, name_psycinfo, source) %>%
  summarise(total_hits = sum(Hits, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(total_hits)) %>%
  select(source, name_psycinfo, total_hits) %>%
  DT::datatable()
## `summarise()` has grouped output by 'DOI', 'name_psycinfo'. You can override
## using the `.groups` argument.
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html